library(ggplot2)
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(viridis)
## Warning: package 'viridis' was built under R version 4.4.3
## Loading required package: viridisLite
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.4.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
datasets::airquality
##     Ozone Solar.R Wind Temp Month Day
## 1      41     190  7.4   67     5   1
## 2      36     118  8.0   72     5   2
## 3      12     149 12.6   74     5   3
## 4      18     313 11.5   62     5   4
## 5      NA      NA 14.3   56     5   5
## 6      28      NA 14.9   66     5   6
## 7      23     299  8.6   65     5   7
## 8      19      99 13.8   59     5   8
## 9       8      19 20.1   61     5   9
## 10     NA     194  8.6   69     5  10
## 11      7      NA  6.9   74     5  11
## 12     16     256  9.7   69     5  12
## 13     11     290  9.2   66     5  13
## 14     14     274 10.9   68     5  14
## 15     18      65 13.2   58     5  15
## 16     14     334 11.5   64     5  16
## 17     34     307 12.0   66     5  17
## 18      6      78 18.4   57     5  18
## 19     30     322 11.5   68     5  19
## 20     11      44  9.7   62     5  20
## 21      1       8  9.7   59     5  21
## 22     11     320 16.6   73     5  22
## 23      4      25  9.7   61     5  23
## 24     32      92 12.0   61     5  24
## 25     NA      66 16.6   57     5  25
## 26     NA     266 14.9   58     5  26
## 27     NA      NA  8.0   57     5  27
## 28     23      13 12.0   67     5  28
## 29     45     252 14.9   81     5  29
## 30    115     223  5.7   79     5  30
## 31     37     279  7.4   76     5  31
## 32     NA     286  8.6   78     6   1
## 33     NA     287  9.7   74     6   2
## 34     NA     242 16.1   67     6   3
## 35     NA     186  9.2   84     6   4
## 36     NA     220  8.6   85     6   5
## 37     NA     264 14.3   79     6   6
## 38     29     127  9.7   82     6   7
## 39     NA     273  6.9   87     6   8
## 40     71     291 13.8   90     6   9
## 41     39     323 11.5   87     6  10
## 42     NA     259 10.9   93     6  11
## 43     NA     250  9.2   92     6  12
## 44     23     148  8.0   82     6  13
## 45     NA     332 13.8   80     6  14
## 46     NA     322 11.5   79     6  15
## 47     21     191 14.9   77     6  16
## 48     37     284 20.7   72     6  17
## 49     20      37  9.2   65     6  18
## 50     12     120 11.5   73     6  19
## 51     13     137 10.3   76     6  20
## 52     NA     150  6.3   77     6  21
## 53     NA      59  1.7   76     6  22
## 54     NA      91  4.6   76     6  23
## 55     NA     250  6.3   76     6  24
## 56     NA     135  8.0   75     6  25
## 57     NA     127  8.0   78     6  26
## 58     NA      47 10.3   73     6  27
## 59     NA      98 11.5   80     6  28
## 60     NA      31 14.9   77     6  29
## 61     NA     138  8.0   83     6  30
## 62    135     269  4.1   84     7   1
## 63     49     248  9.2   85     7   2
## 64     32     236  9.2   81     7   3
## 65     NA     101 10.9   84     7   4
## 66     64     175  4.6   83     7   5
## 67     40     314 10.9   83     7   6
## 68     77     276  5.1   88     7   7
## 69     97     267  6.3   92     7   8
## 70     97     272  5.7   92     7   9
## 71     85     175  7.4   89     7  10
## 72     NA     139  8.6   82     7  11
## 73     10     264 14.3   73     7  12
## 74     27     175 14.9   81     7  13
## 75     NA     291 14.9   91     7  14
## 76      7      48 14.3   80     7  15
## 77     48     260  6.9   81     7  16
## 78     35     274 10.3   82     7  17
## 79     61     285  6.3   84     7  18
## 80     79     187  5.1   87     7  19
## 81     63     220 11.5   85     7  20
## 82     16       7  6.9   74     7  21
## 83     NA     258  9.7   81     7  22
## 84     NA     295 11.5   82     7  23
## 85     80     294  8.6   86     7  24
## 86    108     223  8.0   85     7  25
## 87     20      81  8.6   82     7  26
## 88     52      82 12.0   86     7  27
## 89     82     213  7.4   88     7  28
## 90     50     275  7.4   86     7  29
## 91     64     253  7.4   83     7  30
## 92     59     254  9.2   81     7  31
## 93     39      83  6.9   81     8   1
## 94      9      24 13.8   81     8   2
## 95     16      77  7.4   82     8   3
## 96     78      NA  6.9   86     8   4
## 97     35      NA  7.4   85     8   5
## 98     66      NA  4.6   87     8   6
## 99    122     255  4.0   89     8   7
## 100    89     229 10.3   90     8   8
## 101   110     207  8.0   90     8   9
## 102    NA     222  8.6   92     8  10
## 103    NA     137 11.5   86     8  11
## 104    44     192 11.5   86     8  12
## 105    28     273 11.5   82     8  13
## 106    65     157  9.7   80     8  14
## 107    NA      64 11.5   79     8  15
## 108    22      71 10.3   77     8  16
## 109    59      51  6.3   79     8  17
## 110    23     115  7.4   76     8  18
## 111    31     244 10.9   78     8  19
## 112    44     190 10.3   78     8  20
## 113    21     259 15.5   77     8  21
## 114     9      36 14.3   72     8  22
## 115    NA     255 12.6   75     8  23
## 116    45     212  9.7   79     8  24
## 117   168     238  3.4   81     8  25
## 118    73     215  8.0   86     8  26
## 119    NA     153  5.7   88     8  27
## 120    76     203  9.7   97     8  28
## 121   118     225  2.3   94     8  29
## 122    84     237  6.3   96     8  30
## 123    85     188  6.3   94     8  31
## 124    96     167  6.9   91     9   1
## 125    78     197  5.1   92     9   2
## 126    73     183  2.8   93     9   3
## 127    91     189  4.6   93     9   4
## 128    47      95  7.4   87     9   5
## 129    32      92 15.5   84     9   6
## 130    20     252 10.9   80     9   7
## 131    23     220 10.3   78     9   8
## 132    21     230 10.9   75     9   9
## 133    24     259  9.7   73     9  10
## 134    44     236 14.9   81     9  11
## 135    21     259 15.5   76     9  12
## 136    28     238  6.3   77     9  13
## 137     9      24 10.9   71     9  14
## 138    13     112 11.5   71     9  15
## 139    46     237  6.9   78     9  16
## 140    18     224 13.8   67     9  17
## 141    13      27 10.3   76     9  18
## 142    24     238 10.3   68     9  19
## 143    16     201  8.0   82     9  20
## 144    13     238 12.6   64     9  21
## 145    23      14  9.2   71     9  22
## 146    36     139 10.3   81     9  23
## 147     7      49 10.3   69     9  24
## 148    14      20 16.6   63     9  25
## 149    30     193  6.9   70     9  26
## 150    NA     145 13.2   77     9  27
## 151    14     191 14.3   75     9  28
## 152    18     131  8.0   76     9  29
## 153    20     223 11.5   68     9  30
?airquality
## starting httpd help server ...
##  done

airquality is a dataset containing daily air quality measurements in New York from May to September of 1973. The dataset contains the day of the month, the month, the temperature (in degrees Fahrenheit), wind speed (miles per hour), solar radiation (Langleys) and ozone concentration (ppb). The ozone data is obtained from the New York state department of conservation and the meteorological data comes from the National Weather service.

Upon first inspection of the data - see the 3D scatter plot - I’m interested in figuring out to what extent the ozone concentration can be explained by variations in both temperature and solar radiation.

Plot 1 - 3D plot containing temperature, ozone, solar radiation and the months

plot_ly(airquality, x = ~Temp, y = ~Ozone, z = ~Solar.R, 
        type = "scatter3d", mode = "markers", color = ~Month,
        colors = colorRamp(c('blue', 'red'))) %>%
  layout(title = "3D Scatter Plot of Ozone, Temperature, and Solar Radiation",
         scene = list(xaxis = list(title = 'Temperature (°F)'),
                      yaxis = list(title = 'Ozone (ppb)'),
                      zaxis = list(title = 'Solar Radiation (watt/m²)')))
## Warning: Ignoring 42 observations

This plot is purely used to inspect the data. Obviously, lower temperatures tend to respond to days earlier in the season. In general, it looks like the higher the solar radiation and the higher the temperature, the higher the ozone concentration.

# Clean the dataset
airquality <- airquality %>% 
  filter(!is.na(Ozone) & !is.na(Temp))

#Linear regression
# Model 1: Ozone vs Temperature
model_temp <- lm(Ozone ~ Temp, data = airquality)
# Model 2: Ozone vs Solar Radiation
model_solar <- lm(Ozone ~ Solar.R, data = airquality)
# Model 3: Ozone vs Temperature and Solar Radiation
model_combined <- lm(Ozone ~ Temp + Solar.R, data = airquality)

# Summarizing the models
summary(model_temp)
## 
## Call:
## lm(formula = Ozone ~ Temp, data = airquality)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.729 -17.409  -0.587  11.306 118.271 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -146.9955    18.2872  -8.038 9.37e-13 ***
## Temp           2.4287     0.2331  10.418  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.71 on 114 degrees of freedom
## Multiple R-squared:  0.4877, Adjusted R-squared:  0.4832 
## F-statistic: 108.5 on 1 and 114 DF,  p-value: < 2.2e-16
summary(model_solar)
## 
## Call:
## lm(formula = Ozone ~ Solar.R, data = airquality)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.292 -21.361  -8.864  16.373 119.136 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.59873    6.74790   2.756 0.006856 ** 
## Solar.R      0.12717    0.03278   3.880 0.000179 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.33 on 109 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.1213, Adjusted R-squared:  0.1133 
## F-statistic: 15.05 on 1 and 109 DF,  p-value: 0.0001793
summary(model_combined)
## 
## Call:
## lm(formula = Ozone ~ Temp + Solar.R, data = airquality)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.610 -15.976  -2.928  12.371 115.555 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -145.70316   18.44672  -7.899 2.53e-12 ***
## Temp           2.27847    0.24600   9.262 2.22e-15 ***
## Solar.R        0.05711    0.02572   2.221   0.0285 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.5 on 108 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.5103, Adjusted R-squared:  0.5012 
## F-statistic: 56.28 on 2 and 108 DF,  p-value: < 2.2e-16
# Extracting R-squared values and AIC
rsq_temp <- summary(model_temp)$r.squared
rsq_solar <- summary(model_solar)$r.squared
rsq_combined <- summary(model_combined)$r.squared
aic_temp <- AIC(model_temp)
aic_solar <- AIC(model_solar)
aic_combined <- AIC(model_combined)

# Create a summary table
model_comparison <- data.frame(
  Model = c("Ozone ~ Temp", "Ozone ~ Solar.R", "Ozone ~ Temp + Solar.R"),
  R_squared = c(rsq_temp, rsq_solar, rsq_combined),
  AIC = c(aic_temp, aic_solar, aic_combined)
)

# Plot 2 + 3 
ggplot(airquality, aes(x = Solar.R, y = Ozone, color = Month)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  scale_color_viridis_c(option = "C", name = "Month") +
  labs(title = "Solar Radiation vs. Ozone Concentration by Month (with Regression)",
       x = "Solar Radiation (Langleys)", y = "Ozone (ppb)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Add residuals and identify outliers for the temp vs ozone plot
airquality_outliers <- airquality %>%
  mutate(
    Residuals = resid(model_temp),
    Std_Residuals = rstandard(model_temp),
    Outlier = abs(Std_Residuals) > 2
  )

# Plot with outliers highlighted
ggplot(airquality_outliers, aes(x = Temp, y = Ozone)) +
  # Regular points
  geom_point(data = subset(airquality_outliers, !Outlier), color = "blue", alpha = 0.6) +
  geom_point(data = subset(airquality_outliers, Outlier), color = "red", alpha = 0.6) +
  # Regression line
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(title = "Temperature vs. Ozone Concentration (with Regression and Outliers)",
       x = "Temperature (°F)", y = "Ozone (ppb)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

These plots all show some form of correlation between the variables of interest. The solar radiation vs ozone concentration by month shows that although a high solar radiation does not necessarily mean a high ozone concentration, all high ozone concentrations do have a high(er) solar radiation. The temperature vs ozone concentration shows - apart from some high ozone outliers - a rather linear trend: the higher the temperature, the higher the ozone concentration. This makes the linear regression interesting.

# Prepare data grid for predictions
new_data <- expand.grid(
  Temp = seq(min(airquality$Temp, na.rm = TRUE), max(airquality$Temp, na.rm = TRUE), length.out = 100),
  Solar.R = seq(min(airquality$Solar.R, na.rm = TRUE), max(airquality$Solar.R, na.rm = TRUE), length.out = 100)
)

# Predict Ozone based on the combined model
new_data$Ozone_Pred <- predict(model_combined, newdata = new_data)

# Plotting the contour with observed data points
# Combine predicted and observed ozone values for a unified color scale
ozone_range <- range(c(new_data$Ozone_Pred, airquality$Ozone), na.rm = TRUE)

ggplot() +
  # Contour of predicted ozone
  geom_tile(data = new_data, aes(x = Temp, y = Solar.R, fill = Ozone_Pred)) +
  geom_contour(data = new_data, aes(x = Temp, y = Solar.R, z = Ozone_Pred), color = "white") +
  # Observed data points with black border
  geom_point(data = airquality, aes(x = Temp, y = Solar.R, fill = Ozone), 
             shape = 21, color = "black", size = 3, stroke = 1, alpha = 0.8) +
  # Unified color scale for both predicted and observed ozone
  scale_fill_viridis_c(name = "Ozone (ppb)", limits = ozone_range, option = "C") +
  # Labels and theme
  labs(
    title = "Observed vs. Predicted Ozone Concentration",
    x = "Temperature (°F)",
    y = "Solar Radiation (Langleys)"
  ) +
  theme_minimal()
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

print(model_comparison)
##                    Model R_squared      AIC
## 1           Ozone ~ Temp 0.4877072 1067.706
## 2        Ozone ~ Solar.R 0.1213419 1083.714
## 3 Ozone ~ Temp + Solar.R 0.5103167 1020.820

From this plot you can see that for low ozone concentrations, the model performs not too bad. However, for higher ozone concentrations, the model severely underestimates the ozone concentration, as the dotted observations are much brighter than the modelled background color. Also, the highest observed ozone concentrations don’t show up in the most upper right corner.

kable(model_comparison, format = "html", caption = "Model Comparison Table") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Model Comparison Table
Model R_squared AIC
Ozone ~ Temp 0.4877072 1067.706
Ozone ~ Solar.R 0.1213419 1083.714
Ozone ~ Temp + Solar.R 0.5103167 1020.820